library(foreign)
library(haven)
library(tidyverse)
library(dplyr)
library(plyr)
library(readxl)

#############
##FUNCTIONS##
#############

getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

########################
##LOAD AND FORMAT DATA##
########################

#Set working directory to local data folder#

workingDirectory <- "ENTER FULL PATHNAME WHERE DATA ARE STORED HERE"

setwd(workingDirectory)

##EAC reports for vbm counts##
e08 <- read_excel("2008 eavs xls august 11 2010/County_Excel/Combined_SectionC.xls") %>%
  dplyr::select(STATE_, JurisName, JurisID, C1b) %>%
  dplyr::rename(stpost=STATE_, cntynm=JurisName, geoid=JurisID, vbm=C1b) %>%
  mutate(geoid=str_sub(geoid, 1, 5), year=2008) %>%
  group_by(year, stpost, geoid) %>%
  dplyr::summarize(vbm=sum(vbm, na.rm=T)) %>%
  filter(!stpost %in% c("AS","GU","PR","VI"), !is.na(vbm), vbm>0)
e12 <- read.dbf(unzip("DBF Files 2012.zip", "DBF Files/Section C_C1-C4.dbf"), as.is=T) %>%
  dplyr::select(STATE, JURISDICTI, FIPSCODE, QC1B) %>%
  dplyr::rename(stpost=STATE, cntynm=JURISDICTI, geoid=FIPSCODE, vbm=QC1B) %>%
  mutate(geoid=str_sub(geoid, 1, 5), year=2012, stpost=ifelse(stpost=="Oh", "OH", stpost)) %>%
  group_by(year, stpost, geoid) %>%
  dplyr::summarize(vbm=sum(vbm, na.rm=T)) %>%
  filter(!(stpost %in% c("AS","GU","PR","VI")))
e14 <- read.dbf(unzip("2014_EAVS_DBF_Files1.zip", "EAVS_Section_C.dbf"), as.is=T) %>%
  dplyr::select(STATE, JURISDICTI, FIPSCODE, QC1B) %>%
  dplyr::rename(stpost=STATE, cntynm=JURISDICTI, geoid=FIPSCODE, vbm=QC1B) %>%
  mutate(geoid=str_sub(geoid, 1, 5), year=2014) %>%
  group_by(year, stpost, geoid) %>%
  dplyr::summarize(vbm=sum(vbm, na.rm=T)) %>%
  filter(!(stpost %in% c("AS","GU","PR","VI")))
e16 <- read_csv(unzip("EAVS_2016_Final_Data_for_Public_Release_v2.csv.zip", "EAVS 2016 Final Data for Public Release v.3.csv")) %>%
  dplyr::select(State, JurisdictionName, FIPSCode, C1b) %>%
  dplyr::rename(stpost=State, cntynm=JurisdictionName, geoid=FIPSCode, vbm=C1b) %>%
  mutate(geoid=as.character(format(geoid, scientific=F))) %>%
  mutate(geoid=str_trim(geoid)) %>%
  mutate(geoid=str_pad(str_sub(as.character(geoid), 1, -6), width=5, side="left", pad="0"), 
         vbm=parse_number(vbm), year=2016) %>%
  group_by(year, stpost, geoid) %>%
  dplyr::summarize(vbm=sum(vbm, na.rm=T)) %>%
  filter(!(stpost %in% c("AS","GU","PR","VI","WI")))
e18 <- read_csv("EAVS_2018_for_Public_Release_Updates3.csv") %>%
  dplyr::select(State_Abbr, Jurisdiction_Name, FIPSCode, C1b) %>%
  dplyr::rename(stpost=State_Abbr, cntynm=Jurisdiction_Name, geoid=FIPSCode, vbm=C1b) %>%
  mutate(geoid=str_sub(geoid, 1, 5), vbm=parse_number(vbm), year=2018) %>%
  group_by(year, stpost, geoid) %>%
  dplyr::summarize(vbm=sum(vbm, na.rm=T)) %>%
  filter(!(stpost %in% c("AS","GU","PR","VI","WI")))

#combining eavs data for each year#
#wi_vbm.csv contains vbm data for WI which is missing from the EAVS files#
e <- rbind.fill(e08, e12, e14, e16, e18, read_csv("wi_vbm.csv") %>% dplyr::select(-cntynm))

##historical policies##
p <- read_csv("context_data_1992_2020_latest.csv")[,-1] %>%
  mutate(vbm_all=ifelse(stpost=="CA" & cntynm=="Alpine" & year>=1996, 1, vbm_all),
         vbm_all=ifelse(stpost=="CA" & cntynm=="Plumas" & year>=2016, 1, vbm_all),
         vbm_noex=ifelse(stpost=="KS" & year==2020, 1, vbm_noex),
         stfips=str_pad(stfips, side="left", width=2, pad="0"),
         cntyfips=str_pad(cntyfips, side="left", width=3, pad="0"),
         geoid=str_pad(geoid, side="left", width=5, pad="0")) %>%
  filter(!(statenm=="Minnesota" & year==1992)) #no info on which counties experimented with no excuse VBM that year

#election day registration plus additions based on 
#https://www.ncsl.org/research/elections-and-campaigns/same-day-registration.aspx
#and media searches to identify when first went into effect
edr <- read_dta("edr_onlinereg_1978_2018.dta") %>%
  rbind.fill(filter(., year==2018) %>% mutate(year=2020)) %>%
  mutate(edr2=ifelse(statenm=="Colorado" & year>2012, 1, edr2),
         edr2=ifelse(statenm=="Connecticut" & year>2012, 1, edr2),
         edr2=ifelse(statenm=="District of Columbia" & year>2010, 1, edr2),
         edr2=ifelse(statenm=="Hawaii" & year>2016, 1, edr2),
         edr2=ifelse(statenm=="Illinois" & year>2014, 1, edr2),
         edr2=ifelse(statenm=="Maryland" & year>2018, 1, edr2),
         edr2=ifelse(statenm=="Michigan" & year>2018, 1, edr2),
         edr2=ifelse(statenm=="Nevada" & year>2019, 1, edr2),
         edr2=ifelse(statenm=="Utah" & year>2016, 1, edr2),
         edr2=ifelse(statenm=="Vermont" & year>2017, 1, edr2),
         edr2=ifelse(statenm=="Washington" & year>2018, 1, edr2),
         stfips=str_pad(as.character(fipsst), side="left", width=2, pad="0")) %>%
  filter(statenm!="United States") %>%
  dplyr::select(year, statenm, stfips, edr2) %>%
  left_join(read_csv("onreg_implementation.csv")) %>%
  mutate(first_onreg_year=ifelse(is.na(first_onreg_year), 2050, first_onreg_year),
         onreg=as.integer(year>=first_onreg_year),
         edr=edr2) %>%
  dplyr::select(-edr2, -first_onreg_year)

p <- left_join(p, edr) %>%
  mutate(onreg=ifelse(statenm=="Missouri" & year>=2014 & cntynm=="Boone", 1, onreg),
         cntynm=str_replace(cntynm, " city", "")) %>%
  mutate(cntynm=str_replace(cntynm, " Parish", ""))

##David Leip Atlas presidential vote data##
d92 <- read_excel(unzip("Pres_Election_Data_1992.xlsx.zip", "Pres_Election_Data_1992.xlsx"), sheet="County")[-1,]
d92 <- d92[,c(1:3,10:12,14:16,53:55)]
names(d92) <- c("cntynm","stpost","totvote","pcdemvt","pcrepvt","pcperotvt","demvt","repvt","perotvt","stfips","cntyfips","geoid")
d92 <- filter(d92, !is.na(cntynm), stpost!="AK") %>%
  mutate(stpost=ifelse(cntynm=="Alaska", "AK", stpost),
         stfips=str_pad(stfips, side="left", width=2, pad="0"),
         cntyfips=ifelse(cntynm=="York" & stpost=="NE", 187, cntyfips)) %>%
  mutate(cntyfips=cntyfips-2, geoid=geoid-2) %>%
  mutate(cntyfips=str_pad(cntyfips, side="left", width=3, pad="0"),
         geoid=str_pad(geoid, side="left", width=5, pad="0"),
         year=1992) %>%
  mutate(geoid=dplyr::recode(geoid, "24003"="24510",
                             "24007"="24005",
                             "51608"="51600",
                             "51628"="51620",
                             "51768"="51760",
                             "51518"="51515",
                             "51773"="51770")) %>%
  filter(stpost!="T")

d96 <- read_excel(unzip("Pres_Election_Data_1996.xlsx.zip", "Pres_Election_Data_1996.xlsx"), sheet="County")[-1,]
d96 <- d96[,c(1:3,10:12,14:16,51:53)]
names(d96) <- c("cntynm","stpost","totvote","pcdemvt","pcrepvt","pcperotvt","demvt","repvt","perotvt","stfips","cntyfips","geoid")
d96 <- filter(d96, !is.na(cntynm), stpost!="AK") %>%
  mutate(stpost=ifelse(cntynm=="Alaska", "AK", stpost),
         stfips=str_pad(stfips, side="left", width=2, pad="0"),
         cntyfips=str_pad(cntyfips, side="left", width=3, pad="0"),
         geoid=str_pad(geoid, side="left", width=5, pad="0"),
         year=1996) %>%
  filter(stpost!="T")

d00 <- read_excel(unzip("Pres_Election_Data_2000.xlsx.zip", "Pres_Election_Data_2000.xlsx"), sheet="County")[-1,]
d00 <- d00[,c(1:3,10:11,14:15,46:48)]
names(d00) <- c("cntynm","stpost","totvote","pcdemvt","pcrepvt","demvt","repvt","stfips","cntyfips","geoid")
d00 <- filter(d00, !is.na(cntynm), stpost!="AK") %>%
  mutate(stpost=ifelse(cntynm=="Alaska", "AK", stpost),
         year=2000) %>%
  filter(stpost!="T")

d04 <- read_excel(unzip("Pres_Election_Data_2004.xlsx.zip", "Pres_Election_Data_2004.xlsx"), sheet="County")[-1,]
d04 <- d04[,c(1:3,10:11,14:15,46:48)]
names(d04) <- c("cntynm","stpost","totvote","pcdemvt","pcrepvt","demvt","repvt","stfips","cntyfips","geoid")
d04 <- filter(d04, !is.na(cntynm), stpost!="AK") %>%
  mutate(stpost=ifelse(cntynm %in% "Alaska", "AK", stpost),
         geoid=ifelse(cntynm %in% "Alaska", "00002", geoid),
         year=2004) %>%
  filter(stpost!="T")

d08 <- read_excel(unzip("Pres_Election_Data_2008.xlsx.zip", "Pres_Election_Data_2008.xlsx"), sheet="County")[-1,]
d08 <- d08[,c(1:3,10:11,14:15,53:55)]
names(d08) <- c("cntynm","stpost","totvote","pcdemvt","pcrepvt","demvt","repvt","stfips","cntyfips","geoid")
d08 <- filter(d08, !is.na(cntynm), stpost!="AK") %>%
  mutate(stpost=ifelse(cntynm=="Alaska", "AK", stpost),
         stfips=str_pad(stfips, side="left", width=2, pad="0"),
         cntyfips=str_pad(cntyfips, side="left", width=3, pad="0"),
         geoid=str_pad(geoid, side="left", width=5, pad="0"),
         stpost=ifelse(cntynm=="Alaska", "AK", stpost),
         year=2008) %>%
  filter(stpost!="T")

d12 <- read_excel(unzip("Pres_Election_Data_2012.xlsx.zip", "Pres_Election_Data_2012.xlsx"), sheet="County")[-1,]
d12 <- d12[,c(1:3,10:11,14:15,58:60)]
names(d12) <- c("cntynm","stpost","totvote","pcdemvt","pcrepvt","demvt","repvt","stfips","cntyfips","geoid")
d12 <- filter(d12, !is.na(cntynm), stpost!="AK") %>%
  mutate(stpost=ifelse(cntynm=="Alaska", "AK", stpost),
         stfips=str_pad(stfips, side="left", width=2, pad="0"),
         cntyfips=str_pad(cntyfips, side="left", width=3, pad="0"),
         geoid=str_pad(geoid, side="left", width=5, pad="0"),
         stpost=ifelse(cntynm=="Alaska", "AK", stpost),
         year=2012) %>%
  filter(stpost!="T")

d16 <- read_excel("Pres_Election_Data_2016.xlsx", sheet="County")[-1,]
d16 <- d16[,c(1:3,10:11,14:15,62:64)]
names(d16) <- c("cntynm","stpost","totvote","pcdemvt","pcrepvt","demvt","repvt","stfips","cntyfips","geoid")
d16 <- filter(d16, !is.na(cntynm), stpost!="AK") %>%
  mutate(stpost=ifelse(cntynm=="Alaska", "AK", stpost),
         stfips=str_pad(stfips, side="left", width=2, pad="0"),
         cntyfips=str_pad(cntyfips, side="left", width=3, pad="0"),
         geoid=str_pad(geoid, side="left", width=5, pad="0"),
         stpost=ifelse(cntynm=="Alaska", "AK", stpost),
         year=2016) %>%
  filter(stpost!="T")

#most recent data from New York Times#
d20 <-  read_csv("pres_vote_county_2020_nyt.csv") %>%
  dplyr::select(cntyfips, cntynm, tvt, trumpd, bidenj) %>%
  dplyr::rename(geoid=cntyfips, totvote=tvt, demvt=bidenj, repvt=trumpd) %>%
  mutate(stfips=str_sub(geoid, 1,2),
         cntyfips=str_sub(geoid, 3,5),
         geoid=ifelse(stfips=="02", "00002", geoid),
         cntyfips=ifelse(stfips=="02", "000", cntyfips),
         cntynm=ifelse(stfips=="02", "Alaska", cntynm)) %>%
  group_by(stfips, cntynm, cntyfips, geoid) %>%
  dplyr::summarize(totvote=sum(totvote), demvt=sum(demvt), repvt=sum(repvt)) %>%
  mutate(pcdemvt=demvt/totvote, pcrepvt=repvt/totvote, 
         geoid=ifelse(cntynm %in% "Yuma" & stfips=="08", "08125", geoid),
         geoid=ifelse(cntynm %in% "Yuma" & stfips=="04", "04027", geoid),
         year=2020) %>%
  left_join(read_dta("icpsrcodes.dta") %>% mutate(stfips=str_pad(fipsst, side="left", width=2, pad="0"))) %>%
  dplyr::select(-c(jacstate, icpsrst, fipsst, statenm))

#combine the files#
d <- rbind.fill(d92,d96,d00,d04,d08,d12,d16,d20) %>%
  left_join(dplyr::select(p, -cntynm, -stpost), by=c("year","stfips","cntyfips","geoid"), 
            suffix=c("_votes","")) %>%
  mutate(cntynm=ifelse(cntynm=="Dade" & stpost=="FL", "Miami-Dade", cntynm)) %>% #misc cleanup
  mutate(cntynm=ifelse(cntynm=="Dona Ana", "Do?a Ana", cntynm)) %>%
  mutate(cntynm=ifelse(cntynm=="Lac Qui Parle", "Lac qui Parle", cntynm)) %>%
  mutate(cntynm=ifelse(cntynm=="Dekalb" & stpost=="IN", "DeKalb", cntynm)) %>%
  mutate(cntynm=ifelse(cntynm=="Dewitt" & stpost=="TX", "DeWitt", cntynm)) %>%
  mutate(cntynm=ifelse(cntynm=="La Salle" & stpost=="LA", "LaSalle", cntynm)) %>%
  mutate(cntynm=ifelse(cntynm=="St. Louis" & cntyfips=="510", "St. Louis City", cntynm)) %>%
  mutate(cntynm=ifelse(geoid=="51515", "Bedford City", cntynm)) %>%
  mutate(cntynm=ifelse(geoid=="51600", "Fairfax City", cntynm)) %>%
  mutate(cntynm=ifelse(geoid=="51620", "Franklin City", cntynm)) %>%
  mutate(cntynm=ifelse(geoid=="51760", "Richmond City", cntynm)) %>%
  mutate(cntynm=ifelse(geoid=="51770", "Roanoke City", cntynm)) %>%
  mutate(cntynm=ifelse(geoid=="24510", "Baltimore City", cntynm)) %>%
  ddply(.(stpost, cntynm), transform, geoid=getmode(geoid)) %>%
  ddply(.(stpost), transform, statenm=getmode(statenm)) %>%
  mutate(dpvote=demvt/(demvt+repvt),
         yr1992=year==1992,
         yr1996=year==1996,
         yr2000=year==2000,
         yr2004=year==2004,
         yr2008=year==2008,
         yr2012=year==2012,
         yr2016=year==2016,
         yr2020=year==2020,
         cases_mn_centered=cases_mn - mean(cases_mn, na.rm=T),
         deaths_mn_centered=deaths_mn - mean(deaths_mn, na.rm=T),
         cases_sum_centered=cases_sum - mean(cases_sum, na.rm=T),
         deaths_sum_centered=deaths_sum - mean(deaths_sum, na.rm=T),
         avr=ifelse(statenm %in% c("Georgia","Oregon") & year<2016, 0, avr),
         avr=ifelse(statenm %in% "Delaware" & year<=2008, 0, avr),
         yrcount=as.integer(as.factor(year))-1) %>%
  filter(!is.na(geoid), !is.na(dpvote))

##identify earliest dates of reforms in each state and county##
#universal vote-by-mail (plus vbm % prior to switch from turnout syntax)#
switch_vbm_all <- read_csv("pcvbm.csv")[,-1] %>%
  dplyr::select(geoid, pcvbm, first_yr_vbm_all)

#no excuse vote-by-mail#
switch_vbm_noex <- filter(p, vbm_noex==1) %>%
  group_by(geoid) %>%
  filter(year==min(year)) %>%
  dplyr::select(geoid, year) %>%
  dplyr::rename(first_yr_vbm_noex=year)

#universal vote-by-mail applications#
switch_vbm_app <- filter(p, vbm_app==1) %>%
  group_by(geoid) %>%
  filter(year==min(year)) %>%
  dplyr::select(geoid, year) %>%
  dplyr::rename(first_yr_vbm_app=year)

#permanent vote-by-mail list#
switch_vbm_perm <- read_csv("switch_vbm_perm.csv")

#automatic voter registration#
switch_avr <- filter(p, avr==1) %>%
  group_by(geoid) %>%
  filter(year==min(year)) %>%
  dplyr::select(geoid, year) %>%
  dplyr::rename(first_yr_avr=year)

#election day registration#
switch_edr <- filter(p, edr==1) %>%
  group_by(geoid) %>%
  filter(year==min(year)) %>%
  dplyr::select(geoid, year) %>%
  dplyr::rename(first_yr_edr=year)

#early voting#
switch_early <- filter(p, early==1) %>%
  group_by(geoid) %>%
  filter(year==min(year)) %>%
  dplyr::select(geoid, year) %>%
  dplyr::rename(first_yr_early=year)

#create lag and lead variables for event studies#
d <- left_join(d, switch_vbm_all) %>%
  left_join(switch_vbm_app) %>%
  left_join(switch_vbm_noex) %>%
  left_join(switch_vbm_perm) %>%
  left_join(switch_avr) %>%
  left_join(switch_edr) %>%
  left_join(switch_early) %>%
  mutate(pcvbm=ifelse(vbm_all==1, pcvbm, 0),
         cycles_to_vbm_all=ifelse(is.na(first_yr_vbm_all), -1, (year-first_yr_vbm_all)/4),
         cycles_to_vbm_app=ifelse(is.na(first_yr_vbm_app), -1, (year-first_yr_vbm_app)/4),
         cycles_to_vbm_noex=ifelse(is.na(first_yr_vbm_noex), -1, (year-first_yr_vbm_noex)/4),
         cycles_to_vbm_perm=ifelse(is.na(first_yr_vbm_perm), -1, (year-first_yr_vbm_perm)/4),
         cycles_to_avr=ifelse(is.na(first_yr_avr), -1, (year-first_yr_avr)/4),
         cycles_to_edr=ifelse(is.na(first_yr_edr), -1, (year-first_yr_edr)/4),
         cycles_to_early=ifelse(is.na(first_yr_early), -1, (year-first_yr_early)/4)) %>%
  mutate(vbm_perm=as.integer(cycles_to_vbm_perm>=0),
         vbm_all_m3=cycles_to_vbm_all<=-3,
         vbm_all_m2=cycles_to_vbm_all==-2,
         vbm_all_m1=cycles_to_vbm_all==-1,
         vbm_all_p0=cycles_to_vbm_all==0,
         vbm_all_p1=cycles_to_vbm_all==1,
         vbm_all_p2=cycles_to_vbm_all>=2,
         vbm_noex_m3=cycles_to_vbm_noex<=-3,
         vbm_noex_m2=cycles_to_vbm_noex==-2,
         vbm_noex_m1=cycles_to_vbm_noex==-1,
         vbm_noex_p0=cycles_to_vbm_noex==0,
         vbm_noex_p1=cycles_to_vbm_noex==1,
         vbm_noex_p2=cycles_to_vbm_noex>=2,
         vbm_perm_m3=cycles_to_vbm_perm<=-3,
         vbm_perm_m2=cycles_to_vbm_perm==-2,
         vbm_perm_m1=cycles_to_vbm_perm==-1,
         vbm_perm_p0=cycles_to_vbm_perm==0,
         vbm_perm_p1=cycles_to_vbm_perm==1,
         vbm_perm_p2=cycles_to_vbm_perm>=2,
         vbm_app_m3=cycles_to_vbm_app<=-3,
         vbm_app_m2=cycles_to_vbm_app==-2,
         vbm_app_m1=cycles_to_vbm_app==-1,
         vbm_app_p0=cycles_to_vbm_app>=0,
         avr_m3=cycles_to_avr<=-3,
         avr_m2=cycles_to_avr==-2,
         avr_m1=cycles_to_avr==-1,
         avr_p0=cycles_to_avr==0,
         avr_p1=cycles_to_avr>=1,
         edr_m3=cycles_to_edr<=-3,
         edr_m2=cycles_to_edr==-2,
         edr_m1=cycles_to_edr==-1,
         edr_p0=cycles_to_edr==0,
         edr_p1=cycles_to_edr==1,
         edr_p2=cycles_to_edr>=2,
         early_m3=cycles_to_early<=-3,
         early_m2=cycles_to_early==-2,
         early_m1=cycles_to_early==-1,
         early_p0=cycles_to_early==0,
         early_p1=cycles_to_early==1,
         early_p2=cycles_to_early>=2) %>%
  mutate(vbm_perm=ifelse(vbm_all==1, 1, vbm_perm)) %>%
  filter(cntynm!="Overseas")

write.csv(d, "pandemic_election_formatted_data_pvote.csv")

